package cn.edu.hfut.dmic.webcollector.example;

import java.io.File;
import java.io.IOException;

import junit.framework.Assert;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import cn.edu.hfut.dmic.webcollector.souplang.Parser;

/**
 * Scratch example that exercises jsoup and HtmlUnit together.
 *
 * <p>It first parses a local HTML snapshot with jsoup and prints information
 * about its {@code <script>} elements, then loads a live search page through
 * HtmlUnit (with JavaScript enabled), re-parses the rendered markup with jsoup
 * and prints the text of one {@code p.comment_txt} element.
 */
public class test {

	/** Local HTML snapshot inspected in the first step. */
	private static final String LOCAL_HTML_PATH = "/Users/crazypg/webdata/1.html";

	/** Base URI handed to jsoup for resolving relative links in the snapshot. */
	private static final String LOCAL_BASE_URI = "http://example.com/";

	/** Page fetched and rendered through HtmlUnit in the second step. */
	private static final String SEARCH_URL = "http://s.weibo.com/weibo/600010?page=3";

	/** Highest {@code <script>} index whose payload length is listed. */
	private static final int MAX_SCRIPT_INDEX = 22;

	/** Index of the {@code <script>} element whose full payload is printed. */
	private static final int DETAIL_SCRIPT_INDEX = 16;

	/** Index of the {@code p.comment_txt} element whose text is printed. */
	private static final int COMMENT_INDEX = 1;

	/** Connection timeout for HtmlUnit in milliseconds; 0 would wait indefinitely. */
	private static final int TIMEOUT_MILLIS = 10000;

	/**
	 * Runs both steps in order. An {@link IOException} from either step is
	 * printed and ends the run; the second step is not attempted if the first
	 * one fails.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			dumpLocalScripts(new File(LOCAL_HTML_PATH));
			printRenderedComment(SEARCH_URL);
			System.out.println("Thank God!");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses the given HTML file and prints the number of {@code <script>}
	 * elements, the payload length of the scripts at indices
	 * {@code MAX_SCRIPT_INDEX} down to 1, and the full payload of the script at
	 * {@code DETAIL_SCRIPT_INDEX}.
	 *
	 * @param input HTML file to parse as UTF-8
	 * @throws IOException if the file cannot be read
	 */
	private static void dumpLocalScripts(File input) throws IOException {
		Document doc = Jsoup.parse(input, "UTF-8", LOCAL_BASE_URI);
		Elements scripts = doc.select("script");

		System.out.println(scripts.size());

		// Clamp the starting index so a page with fewer scripts than expected
		// does not fail with IndexOutOfBoundsException. Index 0 is not listed.
		int highest = Math.min(MAX_SCRIPT_INDEX, scripts.size() - 1);
		for (int i = highest; i > 0; i--) {
			System.out.println(i + "=" + scripts.get(i).data().length());
		}

		if (scripts.size() > DETAIL_SCRIPT_INDEX) {
			System.out.println("=" + scripts.get(DETAIL_SCRIPT_INDEX).data());
		} else {
			System.err.println("No <script> element at index " + DETAIL_SCRIPT_INDEX
					+ " (found " + scripts.size() + ")");
		}
	}

	/**
	 * Loads {@code url} through HtmlUnit, prints the rendered page as XML, then
	 * parses that XML with jsoup and prints the text of the
	 * {@code p.comment_txt} element at {@code COMMENT_INDEX}.
	 *
	 * @param url address of the page to fetch
	 * @throws IOException if the page cannot be retrieved
	 */
	private static void printRenderedComment(String url) throws IOException {
		// Request the web page with HtmlUnit.
		// NOTE(review): this WebClient is never closed. HtmlUnit exposes a
		// release method whose name depends on the library version (close() in
		// newer releases, closeAllWindows() in older ones) - confirm against the
		// version on the classpath and call it in a finally block.
		WebClient wc = new WebClient();
		wc.getOptions().setJavaScriptEnabled(true); // enable the JS interpreter (true is the default)
		wc.getOptions().setCssEnabled(false); // disable CSS support
		wc.getOptions().setThrowExceptionOnScriptError(false); // do not throw on JS runtime errors
		wc.getOptions().setTimeout(TIMEOUT_MILLIS); // connection timeout; 0 would wait indefinitely

		HtmlPage page = wc.getPage(url);
		String pageXml = page.asXml(); // response text serialized as XML

		System.out.println("==>" + pageXml);

		// Parse the rendered document with jsoup.
		Document rendered = Jsoup.parse(pageXml);
		Elements comments = rendered.select("p.comment_txt");
		if (comments.size() > COMMENT_INDEX) {
			Element pv = comments.get(COMMENT_INDEX);
			System.out.println(pv.text());
		} else {
			System.err.println("No p.comment_txt element at index " + COMMENT_INDEX
					+ " (found " + comments.size() + ")");
		}
	}

}
